Conditional random fields (CRFs) are a class of statistical modeling methods often applied in pattern recognition and machine learning for structured prediction. Whereas an ordinary classifier predicts a label for a single sample without considering "neighbouring" samples, a CRF can take context into account. To do so, the predictions are represented as a graphical model that encodes the dependencies between them. The kind of graph used depends on the application.
Typical applications of CRFs include labeling or parsing sequential data, such as natural-language text or biological sequences. For example, the following code trains a CRF to tag person names in sentences:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
# Toy corpus for person-name tagging.
# A sentence is a list of token dictionaries; every token carries a 'word'
# and a 'label' key.
# Label scheme: 'B-PER' opens a person entity, 'I-PER' continues one, and
# 'O' marks a token that lies outside any entity.
train_data = [
    [
        {'word': 'John', 'label': 'B-PER'},
        {'word': 'Doe', 'label': 'I-PER'},
        {'word': 'works', 'label': 'O'},
    ],
    [
        {'word': 'Alice', 'label': 'B-PER'},
        {'word': 'Smith', 'label': 'I-PER'},
        {'word': 'is', 'label': 'O'},
        {'word': 'an', 'label': 'O'},
        {'word': 'engineer', 'label': 'O'},
    ],
]

# Held-out sentence used only for prediction and evaluation.
test_data = [
    [
        {'word': 'David', 'label': 'B-PER'},
        {'word': 'Brown', 'label': 'I-PER'},
        {'word': 'is', 'label': 'O'},
        {'word': 'a', 'label': 'O'},
        {'word': 'doctor', 'label': 'O'},
    ],
]
# Per-token feature extraction
def word2features(sent, i):
    """Build the feature dictionary for the token at index ``i`` of ``sent``.

    Args:
        sent: Sequence of token dictionaries; only the 'word' entry is read.
        i: Index of the token to describe.

    Returns:
        A dict that always holds 'bias' (1.0) and 'word.lower()'. In addition:
        the first token gets ``'BOS': True``, every other token gets the
        2- and 3-character suffixes of its own word; the last token gets
        ``'EOS': True``, every other token gets the 2- and 3-character
        prefixes of its own word.

    NOTE(review): suffixes are skipped for the first token and prefixes for
    the last token, and no neighbouring-word features are produced. Confirm
    that this is intended.
    """
    token = sent[i]['word']
    feats = {'bias': 1.0, 'word.lower()': token.lower()}

    at_start = i <= 0
    at_end = i >= len(sent) - 1

    if at_start:
        feats['BOS'] = True
    else:
        # Suffixes of the current word.
        feats['word[-3:]'] = token[-3:]
        feats['word[-2:]'] = token[-2:]

    if at_end:
        feats['EOS'] = True
    else:
        # Prefixes of the current word.
        feats['word[:3]'] = token[:3]
        feats['word[:2]'] = token[:2]

    return feats
# Sentence-level conversion helpers
def sent2features(sent):
    """Return one feature dictionary per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token
def sent2labels(sent):
    """Return the 'label' value of every token in ``sent``, in token order."""
    tags = []
    for entry in sent:
        tags.append(entry['label'])
    return tags
# Turn each sentence into its list of per-token feature dicts (X) and its
# list of gold labels (y).
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))
X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

# Build the CRF and fit it on the training sentences.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Predict a label sequence for every test sentence.
y_pred = crf.predict(X_test)

# Compare predictions against the gold labels and print the report.
report = metrics.flat_classification_report(y_test, y_pred)
print(report)